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In [1]: 


import pandas as pd 


In [2]: 


# Load the raw customer-acquisition data set into a DataFrame.
# NOTE(review): the export drops underscores, so the file name is assumed to be
# 'customer_acq.csv' — confirm against the file on disk.
df = pd.read_csv('customer_acq.csv')


In [3]: 


# Preview the first rows to sanity-check that the file parsed as expected.
df.head()


out[3]: 


food category 


Breakfast 
Foods 


Breakfast 
Foods 


Breakfast 
Foods 


Breakfast 
Foods 


Breakfast 
Foods 


5 rows x 40 columns 


In [4]: 
# Preview the last rows; together with head() this shows the index runs to the end of the file.
df.tail()
Out[4]: 
food category 
60423 Specialty 
60424 Specialty 
60425 Specialty 
60426 Specialty 
60427 Specialty 


5 rows x 40 columns 


food department food family 


Frozen Foods 


Frozen Foods 


Frozen Foods 


Frozen Foods 


Frozen Foods 


food department 


Carousel 


Carousel 


Carousel 


Carousel 


Carousel 
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Food 


Food 


Food 


Food 


food family 


Non- 
Consumable 


Non- 
Consumable 


Non- 
Consumable 


Non- 
Consumable 


Non- 
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millions) 


7.36 


5.52 


3.68 


3.68 


4.08 


millions) 


2.7232 


2.5944 


1.3616 


1.1776 


1.4280 


store sales(in store cost(in unit sales(in 


millions) 1 


4 


store sales(in store cost(in unit sales( 
millions) 


2.76 


1.60 


5.52 


8.28 


9.20 


millions) 


1.3248 


0.4960 


2.5392 


2.5668 


4.2320 


million 
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In [5]: 


# (number of rows, number of columns) of the loaded frame.
df.shape


out[5]: 


(60428, 40) 


In [6]: 


# List every column label so later cells can reference them exactly.
df.columns


out[6]: 


Index(['food category', 'food department', 'food family',
'store sales(in millions)', 'store cost(in millions)',
'unit sales(in millions)', 'promotion name', 'sales country',
'marital status', 'gender', 'total children', 'education',
'member card', 'occupation', 'houseowner', 'avg cars at home(appro
x)',
'avg. yearly income', 'num children at home',
'avg cars at home(approx).1', 'brand name', 'SRP', 'gross weight',
'net weight', 'recyclable package', 'low fat', 'units per case',
'store type', 'store city', 'store state', 'store sqft', 'grocery s
qft',
'frozen sqft', 'meat sqft', 'coffee bar', 'video store', 'salad ba
r',
'prepared food', 'florist', 'media type', 'cost'],
dtype='object')

In [8]: 


# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()


Out[8]: 


0
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In [9]: 


# Number of missing values in each column.
df.isnull().sum()


out[9]: 


food category 

food department 

food family 

store sales(in millions) 
store cost(in millions) 
unit sales(in millions) 
promotion name 

sales country 

marital status 

gender 

total children 
education 

member card 

occupation 

houseowner 

avg cars at home(approx) 
avg. yearly income 

num children at home 
avg cars at home(approx).1 
brand name 

SRP 

gross weight 

net weight 

recyclable package 

low fat 

units per case 

store type 

store city 

store state 

store sqft 

grocery sqft 

frozen sqft 

meat sqft 

coffee bar 

video store 

salad bar 

prepared food 

florist 

media type 

cost 

dtype: int64 
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In [19]: 


# Per-column non-null counts and dtypes, plus the frame's memory footprint.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60428 entries, 0 to 60427
Data columns (total 40 columns):


# 
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Column 

food cate 
food depa 
food fami 
store sal 
store cos 
unit sale 
promotion 
sales cou 
marital s 
gender 

total chi 
education 
member ca 
occupatio 
houseowne 


avg cars 


avg. year 
num child 


avg cars 


brand nam 
SRP 

gross wei 
net weigh 
recyclabl 
low fat 
units per 
store typ 
store cit 
store sta 
store sqf 
grocery s 
frozen sq 
meat sqft 
coffee ba 
video sto 
salad bar 


prepared 


florist 
media typ 
cost 
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Non-Null Count 


gory 60428 
rtment 60428 
ly 60428 
es(in millions) 60428 
t(in millions) 60428 
s(in millions) 60428 
name 60428 
ntry 60428 
tatus 60428 

60428 
ldren 60428 

60428 
rd 60428 
n 60428 
r 60428 
at home(approx) 60428 
ly income 60428 
ren at home 60428 
at home(approx).1 60428 
e 60428 

60428 
ght 60428 
t 60428 
e package 60428 

60428 
. Case 60428 
e 60428 
y 60428 
te 60428 
t 60428 
qft 60428 
ft 60428 

60428 
r 60428 
re 60428 

60428 
food 60428 

60428 
e 60428 

60428 


non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 


dtypes: float64(6), int64(17), object(17) 
memory usage: 


18.44 MB 
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float64 
float64 
int64 
object 
object 
object 
object 
int64 
object 
object 
object 
object 
int64 
object 
int64 
int64 
object 
float64 
float64 
float64 
int64 
int64 
int64 
object 
object 
object 
int64 
int64 
int64 
int64 
int64 
int64 
int64 
int64 
int64 
object 
float64 
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In [11]: 


# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Out[11]: 


count 
mean 


std 


store_sales(in 
millions) 


60428.000000 
6.541031 
3.463047 
0.510000 
3.810000 
5.940000 
8.670000 

22.920000 


8 rows x 23 columns 


store cost(in 
millions) 


60428.000000 
2.619460 
1.453009 
0.163200 
1.500000 
2.385600 
3.484025 
9.726500 
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millions) 
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total children 


avg cars at 
home(approx) 


60428.000000 60428.000000 60428.000000 


3.093169 
0.827677 
1.000000 
3.000000 
3.000000 
4.000000 
6.000000 


2.533875 
1.490165 
0.000000 
1.000000 
3.000000 
4.000000 
5.000000 


2.200271 
1.109644 
0.000000 
1.000000 
2.000000 
3.000000 
4.000000 


num childrei 


60: 
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In [12]: 


# Number of distinct values per column; low counts point to categorical or flag-like columns.
df.nunique()


out [12]: 

food category 45 
food department 22 
food family 3 
store sales(in millions) 1033 
store cost(in millions) 9919 
unit sales(in millions) 6 
promotion name 49 
sales country 3 
marital status 2 
gender 2 
total children 6 
education 5 
member card 4 
occupation 5 
houseowner 2 
avg cars at home(approx) 5 
avg. yearly income 8 
num children at home 6 
avg cars at home(approx).1 5 
brand name 111 
SRP 315 
gross weight 376 
net weight 332 
recyclable package 2 
low fat 2 
units per case 36 
store type 5 
store city 19 
store state 10 
store sqft 20 
grocery sqft 20 
frozen sqft 20 
meat sqft 20 
coffee bar 2 
video store 2 
salad bar 2 
prepared food 2 
florist 2 
media type 13 
cost 328 


dtype: int64 


In [13]: 


# Labels of the string-typed (object dtype) columns; reused by the
# categorical exploration loops further down.
obj_cols = df.select_dtypes(include=['object']).columns
print('Object columns:', obj_cols)


Object columns: Index(['food category', 'food department', 'food family',
'promotion name',
'sales country', 'marital status', 'gender', 'education', 'member c
ard',
'occupation', 'houseowner', 'avg. yearly income', 'brand name',
'store type', 'store city', 'store state', 'media type'],
dtype='object')
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In [15]: 


import numpy as np 


In [16]: 


# Labels of every numeric column (ints and floats); reused by the
# distribution and correlation cells further down.
num_cols = df.select_dtypes(include=np.number).columns
print('Numerical columns:', num_cols)


Numerical columns: Index(['store sales(in millions)', 'store cost(in milli
ons)',
'unit sales(in millions)', 'total children', 'avg cars at home(appr
ox)',
'num children at home', 'avg cars at home(approx).1', 'SRP',
'gross weight', 'net weight', 'recyclable package', 'low fat',
'units per case', 'store sqft', 'grocery sqft', 'frozen sqft',
'meat sqft', 'coffee bar', 'video store', 'salad bar', 'prepared fo
od',
'florist', 'cost'],
dtype='object')


In [17]: 


import matplotlib.pyplot as plt 
import seaborn as sns 


In [18]: 


import warnings 
# Silence every warning from here on. NOTE(review): this also hides library
# deprecation notices raised by the plotting cells below.
warnings. filterwarnings('ignore')


In [21]: 

# For each string-typed column, print its label followed by the array of
# distinct values it contains, separated by blank lines.
for col in obj_cols:
    print(col)
    print(df[col].unique())
    print('\n')


food_category 


[ 


S 


'Breakfast Foods' 'Bread' 'Canned Shrimp' 'Baking Goods' 'Vegetables' 
'Frozen Desserts' 'Candy' 'Snack Foods' 'Dairy' 'Starchy Foods' 
'Cleaning Supplies' 'Decongestants' 'Meat' 'Hot Beverages' 

'Jams and Jellies' 'Carbonated Beverages' 'Seafood' 'Specialty' 
'Kitchen Products' 'Electrical' 'Beer and Wine' 'Candles' 'Fruit' 
'Pure Juice Beverages' 'Canned Soup' 'Paper Products' 'Canned Tuna' 
'Eggs' 'Hardware' 'Canned Sardines' 'Canned Clams' 'Pain Relievers' 
'Side Dishes' 'Bathroom Products' 'Magazines' 'Frozen Entrees' 'Pizza' 
"Cold Remedies' 'Canned Anchovies' 'Drinks' 'Hygiene' 'Plastic Product 


'Canned Oysters' 'Packaged Vegetables' 'Miscellaneous'] 


food department 


[ 


'Frozen Foods' 'Baked Goods' 'Canned Foods' 'Baking Goods' 'Produce' 

'Snacks' 'Snack Foods' 'Dairy' 'Starchy Foods' 'Household' 

'Health and Hygiene' 'Meat' 'Beverages' 'Seafood' 'Deli' 

"Alcoholic Beverages' 'Canned Products' 'Eggs' 'Periodicals' v 


rm te a r aeg eL 1 at tA tr 
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In [22]: 
# For each string-typed column, print its label followed by the frequency of
# every distinct value (value_counts sorts most frequent first).
for col in obj_cols:
    print(col)
    print(df[col].value_counts())
    print('\n')
food_category = 
Vegetables 7440 
Snack Foods 6919 
Dairy 3835 
Meat 3107 
Fruit 3080 
Jams and Jellies 2550 
Baking Goods 1947 
Breakfast Foods 1946 
Bread 1797 
Canned Soup 1722 
Beer and Wine 1590 
Paper Products 1568 
Bathroom Products 1552 
Electrical 1544 
Candy 1538 
Frozen Desserts 1446 
Specialty 1174 
Starchy Foods 1103 v 
In [24]: 
# One bar chart of category frequencies per string-typed column.
for col in obj_cols:
    plt.figure(figsize=(15, 6))
    # Pass the column by keyword: newer seaborn releases reject a positional
    # data vector. 'hls' is the seaborn palette name.
    sns.countplot(data=df, x=col, palette='hls')
    plt.title(col)
    # Category labels are long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()
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In [25]: 


# Title font shared by every pie chart; it does not depend on the column, so
# it is built once outside the loop.
hfont = {'fontname': 'serif', 'weight': 'bold'}

# One pie chart of category shares per string-typed column.
for col in obj_cols:
    # Compute the frequencies once and reuse them for both wedges and labels.
    counts = df[col].value_counts()
    plt.figure(figsize=(30, 20))
    # NOTE(review): the original pie() call is cut off after `autopct` in this
    # export; any further keyword arguments could not be recovered.
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%')
    plt.title(col, size=20, **hfont)
    plt.show()


food category PN 


Dairy 
Snack Foods 


Jams and Jellies 
Vegetables 


Baking Goods 


Breakfast Foods 


Bread à 
d Tuna 
gestants 
emedies 
Canned Soup Hardware 
Plastic Products 


Drinks 

Frozen Entrees 

Side Dishes 

Cleaning Supplies x 


Tema. Torden Paven wan 


Beer and Wine 


Paper Products 


In [26]: 


# One 20-bin histogram with a KDE overlay per numeric column (y axis = counts).
for col in num_cols:
    plt.figure(figsize=(15, 6))
    # No `palette`: without a `hue` variable histplot has nothing to map it to.
    sns.histplot(data=df, x=col, kde=True, bins=20)
    plt.title(col)
    plt.xticks(rotation=90)
    plt.show()
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In [27]: 


# One density-normalised histogram with a KDE overlay per numeric column.
# sns.distplot is deprecated; histplot with stat='density' is its documented
# replacement and keeps the same normalisation (bar areas sum to 1).
for col in num_cols:
    plt.figure(figsize=(15, 6))
    sns.histplot(data=df, x=col, kde=True, bins=20, stat='density')
    plt.title(col)
    plt.xticks(rotation=90)
    plt.show()


8 
store sales(in millions) 


0.25 


0.20 


v 
In [28]: 
# One horizontal box plot per numeric column to expose spread and outliers.
for col in num_cols:
    plt.figure(figsize=(15, 6))
    # Pass the column by keyword: newer seaborn releases reject a positional
    # data vector. 'hls' is the seaborn palette name.
    sns.boxplot(data=df, x=col, palette='hls')
    plt.title(col)
    plt.xticks(rotation=90)
    plt.show()
^ 


store sales(in millions) 


localhost:8888/notebooks/customer acq.ipynb 10/15 


4/6/23, 5:34 PM customer acq - Jupyter Notebook 


In [29]: 


# One horizontal violin plot per numeric column to show the shape of its
# distribution.
for col in num_cols:
    plt.figure(figsize=(15, 6))
    # Pass the column by keyword: newer seaborn releases reject a positional
    # data vector. 'hls' is the seaborn palette name.
    sns.violinplot(data=df, x=col, palette='hls')
    plt.title(col)
    plt.xticks(rotation=90)
    plt.show()


- a a a R 
store sales(in millions) 


for i in num_cols: for j in num_cols: plt.figure(figsize=(15,6)) sns.lineplot(x = df[i], y = df[j], data = df, palette =
'hls') plt.xticks(rotation = 90) plt.show()
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In [31]: 


# Scatter plot for every pair of numeric columns.
# Each unordered pair is drawn once: plotting a column against itself is
# uninformative, and (a, b) is just (b, a) with the axes swapped, so walking
# only the columns after `x_col` avoids more than half of the figures.
for pos, x_col in enumerate(num_cols):
    for y_col in num_cols[pos + 1:]:
        plt.figure(figsize=(15, 6))
        # No `palette`: without a `hue` variable there is nothing to colour by.
        sns.scatterplot(data=df, x=x_col, y=y_col)
        plt.title(f'{y_col} vs {x_col}')
        plt.xticks(rotation=90)
        plt.show()
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for i in obj_cols: for j in num_cols: plt.figure(figsize=(15,6)) sns.barplot(x = df[i], y = df[j], data = df, palette =
'hls') plt.xticks(rotation = 90) plt.show()
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In [34]: 


# For each string-typed column, plot the average `cost` per category
# (barplot aggregates with the mean by default).
for col in obj_cols:
    plt.figure(figsize=(15, 6))
    # ci=None turns off the bootstrap error bars.
    sns.barplot(data=df, x=col, y='cost', ci=None, palette='hls')
    plt.title(f'cost by {col}')
    plt.xticks(rotation=90)
    plt.show()
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In [35]: 


# Pairwise correlation matrix of the numeric columns (DataFrame.corr uses
# Pearson correlation by default); kept in `df_corr` for the table and
# heatmap cells that follow.
df_corr = df[num_cols].corr()
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In [36]: 
# Display the correlation matrix as a table.
df_corr
out[36]: 
store sales(in store cost(in unit sales(in total children avg cars. 
millions) millions) millions) home(appro 
store_sales(in millions) 1.000000 0.954685 0.503482 0.083313 0.0044: 
store_cost(in millions) 0.954685 1.000000 0.480087 0.079058 0.0028! 
unit sales(in millions) 0.503482 0.480087 1.000000 0.163188 0.0236! 
total children 0.083313 0.079058 0.163188 1.000000 0.0981 
avg cars at home(approx) 0.004498 0.002865 0.023667 0.098110 1.00001
num children at home 0.032437 0.027576 0.066725 0.394709 0.1308: 
avg cars at home(approx).1 0.004498 0.002865 0.023667 0.098110 1.0000!
SRP 0.833478 0.795880 -0.002358 0.000545 -0.0079: 
gross weight 0.036179 0.034237 0.001255 -0.000186 0.0045: 
net weight 0.032014 0.030257 0.001137 0.000142 0.0041: 
recyclable package 0.034293 0.030213 0.001599 0.002794 0.0037: 
low fat -0.006134 -0.005976 -0.001129 -0.002824 -0.0043 
units per case -0.010630 -0.009792 0.000084 0.002307 -0.00721 
store_sqft 0.015543 0.017877 0.031464 0.000555 -0.0158 
grocery_sqft 0.010442 0.012884 0.024857 0.018526 -0.0176: 
frozen sqft 0.017886 0.019245 0.030563 -0.026926 -0.0074 
meat_sqft 0.017883 0.019242 0.030557 -0.026923 -0.00741 
coffee_bar -0.029368 -0.027126 -0.057633 0.002836 -0.00271 
video_store 0.019179 0.019252 0.034996 -0.000591 0.0140! 
salad bar 0.031459 0.033206 0.057878 -0.013764 -0.0089; 
prepared food 0.031459 0.033206 0.057878 -0.013764 -0.0089; 
florist 0.030603 0.030929 0.055885 -0.003361 -0.0041: 
cost -0.004621 -0.004162 -0.015015 -0.003900 0.0116: 
23 rows x 23 columns 
> 
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In [38]: 


# Annotated heatmap of the lower triangle of the correlation matrix.
plt.figure(figsize=(30, 10))

# Boolean mask that is True on and above the diagonal, so each pair is shown
# only once. It is built from the matrix shape rather than its values: using
# np.triu(df_corr) directly would leave any exactly-zero correlation in the
# upper triangle unmasked.
upper_mask = np.triu(np.ones_like(df_corr, dtype=bool))

sns.heatmap(df_corr, annot=True, linewidths=.8, mask=upper_mask, cmap="rocket")
plt.show()
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